VISU

Import des packages :

library(tidyverse)
library(knitr)
library(rmarkdown)
library(markdown)
library(data.table)
library(plotly)
library(viridis)
library(hrbrthemes)
library(lubridate)
library(highcharter)
library(shiny)
library(shinydashboard)

Chargement des jeux de données :

# Load the anime and user datasets.
df_anime <- fread(file = "datasets/anime_filtered.csv")
df_users <- fread(file = "datasets/users_filtered.csv")

# Users restricted to the two plotted genders and to rows with a known birth
# date. is.null() applied to a whole column returns a single FALSE, so the
# former `!is.null(birth_date)` test never removed any row; is.na() performs
# the intended element-wise check.
users <- df_users %>%
  filter(gender %in% c("Female", "Male"), !is.na(birth_date)) %>%
  select(username, gender, user_completed, user_days_spent_watching, birth_date)

# Age column: year component of the period between birth date and today.
users$age <- as.period(
  interval(start = users$birth_date, end = as.Date(now()))
)$year

# Dataset and derived columns for the per-decade density plots.
# `aired` is reduced to its first date-like token (or the literal "None");
# year and month are parsed only when that token is exactly 10 characters long.
date_data <- df_anime %>%
  # Keep the result a tibble, as the former rowwise()/ungroup() round trip did.
  as_tibble() %>%
  mutate(
    # str_extract() is vectorised and yields the first match (NA when there is
    # none), which is what str_extract_all(...)[[1]][1] produced row by row.
    aired = str_extract(aired, "(None|[0-9]*-[0-9]+-[0-9]+)"),
    # Typed NA instead of NULL: if_else() requires a vector for `false`
    # (passing NULL is an error in dplyr >= 1.1).
    year = as.integer(
      if_else(nchar(aired) == 10, substr(aired, 1, 4), NA_character_)
    ),
    month = as.integer(
      if_else(nchar(aired) == 10, substr(aired, 6, 7), NA_character_)
    ),
    # Quarter of the year; a missing month gives a missing season.
    season = case_when(
      month %in% 1:3 ~ "Winter",
      month %in% 4:6 ~ "Spring",
      month %in% 7:9 ~ "Summer",
      month %in% 10:12 ~ "Autumn",
      TRUE ~ NA_character_
    ),
    year_season = paste(year, season),
    # First three digits of the year followed by "0's", e.g. 1998 -> "1990's".
    decade = as.factor(
      if_else(
        !is.na(year),
        paste0(substr(year, 1, 3), "0's"),
        NA_character_
      )
    )
  ) %>%
  arrange(year, month)

# Median score used as the reference line in the box plots.
# Only anime scored by at least 100 users are taken into account.
# The result is a one-row, one-column table (`median_sc`).
median_val <- df_anime %>%
  filter(scored_by > 99) %>%
  summarise(median_sc = median(score))

# Convert the textual `duration` column into a number of minutes.
# The hour, minute and second counts are each read with a look-ahead pattern
# and fall back to 0 when the corresponding unit is not detected.
duration_data <- df_anime %>%
  mutate(
    h = if_else(
      str_detect(duration, "[0-9]*(?= hr.)"),
      as.numeric(str_extract(duration, "[0-9]*(?= hr.)")),
      0
    ),
    m = if_else(
      str_detect(duration, "[0-9]*(?= min.)"),
      as.numeric(str_extract(duration, "[0-9]*(?= min.)")),
      0
    ),
    s = if_else(
      str_detect(duration, "[0-9]*(?= sec.)"),
      as.numeric(str_extract(duration, "[0-9]*(?= sec.)")),
      0
    )
  ) %>%
  mutate(
    # Total in minutes; a total of 0 is turned into NA.
    duration = na_if(h * 60 + m + s / 60, 0)
  )

OVERVIEW

# Line chart of the number of anime per year (rows without a year are dropped).
line_nb_year <- date_data %>%
  filter(!is.na(year)) %>%
  count(Year = year, name = "Freq") %>%
  ggplot(aes(x = Year, y = Freq)) +
  geom_line(color = "red", size = 1) +
  ggtitle("Number of anime per year") +
  scale_x_continuous(
    limits = c(1910, 2017),
    breaks = seq(from = 1910, to = 2017, by = 25),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 250)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
line_nb_year <- ggplotly(line_nb_year)
line_nb_year
# Number of anime per type; entries whose type is "Unknown" are left out.
data_pie_type <- df_anime %>%
  filter(type != "Unknown") %>%
  group_by(Type = type) %>%
  summarise(Freq = n())

# Pie chart of the type counts. The tooltip shows the type, its share of the
# pie and the raw count.
highchart() %>%
  hc_add_series(
    data_pie_type,
    hcaes(x = Type, y = Freq, color = Type),
    type = "pie"
  ) %>%
  hc_tooltip(
    borderWidth = 1.5,
    headerFormat = "",
    pointFormat = "<b>Type: {point.Type}</b> ({point.percentage:.1f}%)<br><b>Count:</b> {point.y}"
  )
# Horizontal box plots of the score for each source, restricted to anime that
# are not airing and were scored by at least 100 users. The dashed line marks
# the value stored in `median_val`.
score_source <- df_anime %>%
  filter(airing == FALSE, scored_by > 99) %>%
  select(score, scored_by, source) %>%
  ggplot(aes(x = source, y = score)) +
  geom_boxplot(fill = "deepskyblue") +
  geom_hline(aes(yintercept = median_val[[1]], linetype = "median")) +
  ggtitle("Score Distribution by Source") +
  scale_linetype_manual(name = "", values = c(median = "dashed")) +
  scale_y_continuous(limits = c(0, 10)) +
  coord_flip() +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_blank()
  )

# Convert to an interactive plotly widget and display it.
score_source <- ggplotly(score_source)
score_source
# Horizontal box plots of the score for each type, restricted to anime that
# are not airing and were scored by at least 100 users. The dashed line marks
# the value stored in `median_val`.
score_type <- df_anime %>%
  filter(airing == FALSE, scored_by > 99) %>%
  select(score, scored_by, type) %>%
  ggplot(aes(x = type, y = score)) +
  geom_boxplot(fill = "deepskyblue") +
  geom_hline(aes(yintercept = median_val[[1]], linetype = "median")) +
  ggtitle("Score Distribution by Type") +
  scale_linetype_manual(name = "", values = c(median = "dashed")) +
  scale_y_continuous(limits = c(0, 10)) +
  coord_flip() +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_blank()
  )

# Convert to an interactive plotly widget and display it.
score_type <- ggplotly(score_type)
score_type
# Hexagonal-bin plot of score against the number of users who scored, for
# anime that are not airing and were scored by at least 100 users.
# A linear fit of score on log(scored_by) is overlaid in red.
rs_score_notes <- df_anime %>%
  filter(airing == FALSE, scored_by > 99) %>%
  ggplot(aes(x = scored_by, y = score)) +
  stat_bin_hex(bins = 50) +
  theme_ipsum() +
  scale_fill_viridis() +
  stat_smooth(method = "lm", color = "red", formula = y ~ log(x)) +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  ) +
  scale_y_continuous(limits = c(2, 9.5), breaks = seq(2, 9.5)) +
  labs(
    title = "Relationship between Score and Scored_by",
    x = "Scored by # people",
    y = "Score"
  )

# Convert to an interactive plotly widget and display it.
rs_score_notes <- ggplotly(rs_score_notes)
rs_score_notes

TV ANIME

# Average score per year for TV anime with a non-zero score that were scored
# by more than 10 users. Averages are rounded to two decimals.
score_year_tv <- date_data %>%
  filter(score != 0, scored_by > 10, type == "TV") %>%
  group_by(Year = year) %>%
  summarise(avg = round(mean(score, na.rm = TRUE), 2)) %>%
  ggplot(aes(x = Year, y = avg)) +
  geom_line(color = "red", size = 1) +
  labs(title = "Average score per year", y = "Average score") +
  scale_x_continuous(
    limits = c(1960, 2017),
    breaks = seq(from = 1960, to = 2017, by = 10),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(6, 7.5),
    breaks = seq(from = 6, to = 7.5, by = 0.2)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
score_year_tv <- ggplotly(score_year_tv)
score_year_tv
# Number of TV anime per decade and rating, drawn as one line (with points)
# per rating. `.drop = FALSE` keeps factor levels that have no rows.
ratings_tv_decade <- date_data %>%
  filter(!is.na(decade), type == "TV") %>%
  group_by(Decade = decade, rating, .drop = FALSE) %>%
  summarise(Freq = n()) %>%
  ggplot(
    aes(x = Decade, y = Freq, group = rating, shape = rating, color = rating)
  ) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  ggtitle("Ratings evolution during decades") +
  scale_y_continuous(
    limits = c(0, 1250),
    breaks = seq(from = 0, to = 1250, by = 250)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
ratings_tv_decade <- ggplotly(ratings_tv_decade)
ratings_tv_decade
# Bubble chart of the 100 best-ranked anime (popularity 0 excluded) that also
# have a popularity of at most 100. Bubble size is
# scored_by * favorites * score / 10^10; colour follows popularity.
bbl_chart <- df_anime %>%
  filter(popularity != 0) %>%
  arrange(rank) %>%
  head(n = 100) %>%
  select(popularity, rank, title, scored_by, favorites, score) %>%
  filter(popularity <= 100) %>%
  mutate(
    point = (as.numeric(scored_by) * as.numeric(favorites) * as.numeric(score)) /
      10^10
  ) %>%
  ggplot(
    aes(x = rank, y = popularity, size = point, color = popularity, text = title)
  ) +
  geom_point(alpha = 0.7) +
  labs(
    title = "TOP 100 Anime by rank and popularity",
    x = "Rank",
    y = "Popularity"
  ) +
  scale_size(range = c(1.4, 19)) +
  scale_color_viridis() +
  scale_x_continuous(
    limits = c(0, 100),
    breaks = seq(from = 0, to = 100, by = 10)
  ) +
  scale_y_continuous(
    limits = c(-5, 100),
    breaks = seq(from = 0, to = 100, by = 25)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Interactive version; the tooltip is limited to rank and popularity.
bbl_chart <- ggplotly(bbl_chart, tooltip = c("rank", "popularity"))
bbl_chart
# Rank of the 100 anime with the lowest non-zero popularity value, drawn as a
# black line with red points.
point_plot_pop <- df_anime %>%
  filter(popularity != 0) %>%
  arrange(popularity) %>%
  head(n = 100) %>%
  select(title, popularity, rank) %>%
  ggplot(aes(x = popularity, y = rank)) +
  geom_line(color = "black") +
  geom_point(size = 2, color = "red") +
  ggtitle("Top 100 anime with their rank score") +
  scale_x_continuous(
    limits = c(-1, 101),
    breaks = seq(from = 0, to = 100, by = 5),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(0, 3000),
    breaks = seq(from = 0, to = 3000, by = 500)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Interactive version; the tooltip is limited to rank and popularity.
point_plot_pop <- ggplotly(point_plot_pop, tooltip = c("rank", "popularity"))
point_plot_pop
# Score density of TV anime, one filled curve per decade.
# Rows without a decade or with a score of 0 are excluded.
score_density_dec <- date_data %>%
  select(decade, type, score) %>%
  filter(!is.na(decade), type == "TV", score != 0) %>%
  ggplot(aes(x = score, group = decade, fill = decade)) +
  geom_density(adjust = 1.25, alpha = 0.75) +
  ggtitle("TV anime score density by decade") +
  scale_x_continuous(
    limits = c(0, 10),
    breaks = seq(from = 0, to = 10, by = 2.5)
  ) +
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(from = 0, to = 1, by = 0.25),
    expand = c(0, 0)
  ) +
  theme_ipsum() +
  theme(
    legend.position = "top",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
score_density_dec <- ggplotly(score_density_dec)
score_density_dec

MOVIES

# Average score per year for movies with a non-zero score that were scored by
# more than 10 users. Averages are rounded to two decimals.
score_year_movie <- date_data %>%
  filter(score != 0, scored_by > 10, type == "Movie") %>%
  group_by(Year = year) %>%
  summarise(avg = round(mean(score, na.rm = TRUE), 2)) %>%
  ggplot(aes(x = Year, y = avg)) +
  geom_line(color = "red", size = 1) +
  labs(title = "Average score per year", y = "Average score") +
  scale_x_continuous(
    limits = c(1930, 2017),
    breaks = seq(from = 1930, to = 2017, by = 25),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(4.5, 7.5),
    breaks = seq(from = 4.5, to = 7.5, by = 0.5)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
score_year_movie <- ggplotly(score_year_movie)
score_year_movie
# Number of movies per decade and rating, drawn as one line (with points) per
# rating. `.drop = FALSE` keeps factor levels that have no rows.
ratings_movie_decade <- date_data %>%
  filter(!is.na(decade), type == "Movie") %>%
  group_by(Decade = decade, rating, .drop = FALSE) %>%
  summarise(Freq = n()) %>%
  ggplot(
    aes(x = Decade, y = Freq, group = rating, shape = rating, color = rating)
  ) +
  geom_line(size = 1) +
  geom_point(size = 2) +
  ggtitle("Ratings evolution during decades") +
  scale_y_continuous(
    limits = c(0, 300),
    breaks = seq(from = 0, to = 300, by = 50)
  ) +
  theme_ipsum() +
  theme(
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
ratings_movie_decade <- ggplotly(ratings_movie_decade)
ratings_movie_decade
# Score density of movies, one filled curve per decade.
# Rows without a decade or with a score of 0 are excluded.
score_density_movie <- date_data %>%
  filter(!is.na(decade), type == "Movie", score != 0) %>%
  select(decade, type, score) %>%
  ggplot(aes(x = score, group = decade, fill = decade)) +
  geom_density(adjust = 1.25, alpha = 0.7) +
  ggtitle("Movies score density by decade") +
  scale_x_continuous(
    limits = c(0, 10),
    breaks = seq(from = 0, to = 10, by = 2.5)
  ) +
  scale_y_continuous(
    limits = c(0, 1),
    breaks = seq(from = 0, to = 1, by = 0.25),
    expand = c(0, 0)
  ) +
  theme_ipsum() +
  theme(
    legend.position = "top",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to an interactive plotly widget and display it.
score_density_movie <- ggplotly(score_density_movie)
score_density_movie

USERS

# Box plot of user age by gender (legend hidden, y axis limited to -1..60).
box_gender_age <- users %>%
  ggplot(aes(x = gender, y = age, fill = gender)) +
  geom_boxplot() +
  labs(y = "Age") +
  scale_y_continuous(
    limits = c(-1, 60),
    breaks = seq(from = 0, to = 60, by = 5),
    expand = c(0, 0)
  ) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

# Convert to a plotly widget; it is not printed here.
box_gender_age <- ggplotly(box_gender_age)
# Box plot of days spent watching by gender (legend hidden, y axis -1..301).
box_gender_spent <- users %>%
  ggplot(aes(x = gender, y = user_days_spent_watching, fill = gender)) +
  geom_boxplot() +
  labs(
    title = "Users' age and days spent by gender",
    y = "Time spent (in days)"
  ) +
  scale_y_continuous(
    limits = c(-1, 301),
    breaks = seq(from = 0, to = 300, by = 50),
    expand = c(0, 0)
  ) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14)
  )

box_gender_spent <- ggplotly(box_gender_spent)

# Place the age and time-spent box plots side by side, keeping their y titles.
# NOTE(review): the result is stored under the name `subplot`, which is also
# the name of the function being called.
subplot <- subplot(
  box_gender_age,
  box_gender_spent,
  nrows = 1,
  titleY = TRUE,
  margin = 0.07
)
subplot
# Hexagonal-bin plot of days spent watching against user age, with the bin
# outline coloured by gender (red / blue). Legend and grid lines are hidden.
scatter_age <- users %>%
  ggplot(aes(x = age, y = user_days_spent_watching, color = gender)) +
  stat_bin_hex(bins = 75, alpha = 0.6) +
  labs(
    title = "Anime watchers' age",
    x = "Age",
    y = "Time spent (in days)"
  ) +
  scale_color_manual(values = c("red", "blue")) +
  scale_x_continuous(
    limits = c(-1, 81),
    breaks = seq(from = 0, to = 80, by = 5),
    expand = c(0, 0)
  ) +
  scale_y_continuous(
    limits = c(-1, 1001),
    breaks = seq(from = 0, to = 1000, by = 200),
    expand = c(0, 0)
  ) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    axis.title.x = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

# Convert to an interactive plotly widget and display it.
scatter_age <- ggplotly(scatter_age)
scatter_age